## Warning in FUN(X[[i]], ...): Found and resolved improper
## quoting out-of-sample. First healed line 139280: <<Hive;
## 837c7b84c6e897d17eb2fc156fd5dd25b7468053;Navis Ryu <navis@apache.org>;
## 2015-01-21T07:57:16;"hcatalog/src/test/e2e/templeton/inpdir/
## PigJoin\344\266\264\343\204\251\351\274\276\344\270\204\347\213\234\343\200\207work.pig\"";-4>>.
## If the fields are not quoted (e.g. field separator does not appear within
## any field), try quote="" to avoid this warning.
## Warning in FUN(X[[i]], ...): Found and resolved improper
## quoting out-of-sample. First healed line 433308:
## <<MariaDB;e4963e7c3762c495d6b67f855e618c3ff8175b7e;"Reggie"/Reggie@xp. <>;
## 2007-06-07T09:13:56;sql-common/client.c;0>>. If the fields are not quoted
## (e.g. field separator does not appear within any field), try quote="" to
## avoid this warning.
## Warning in FUN(X[[i]], ...): Found and resolved improper
## quoting out-of-sample. First healed line 507408:
## <<MySQL;e4963e7c3762c495d6b67f855e618c3ff8175b7e;"Reggie"/Reggie@xp. <>;
## 2007-06-07T09:13:56;sql-common/client.c;0>>. If the fields are not quoted
## (e.g. field separator does not appear within any field), try quote="" to
## avoid this warning.

Alte Auswertungen mit allen Datenbanken

In der alten Auswertung wurden die Files mit den meisten Commits gezeigt. Da dies über alle Datenbanken nur schwer geht habe ich mich für einen Densitiy Plot entschieden. Später folgt noch eine Auswertung mit der am stärksten vertretenen Programmiersprache. Hier ein Density Plot über die Anzahl an Commits per File in einem Repository

Commits per File

extractedTable_CommitCountByFile <- DBData %>%
  group_by(name, file) %>%
    summarise(commitCount = n_distinct(commit))

extMean <- extractedTable_CommitCountByFile %>%
  group_by(name) %>%
  summarise(median = median(commitCount),mean = mean(commitCount))

knitr::kable(extMean)
name median mean
Cassandra 3 11.156520
Couchbase 2 9.998494
CouchDB 3 7.327409
Elasticsearch 3 5.461175
etcd 2 5.012709
Firebird 2 18.172426
Greenplum 2 7.983630
Hazelcast 6 10.883780
HBase 3 8.481786
Hive 2 6.676108
Impala 3 9.729001
InfluxDB 3 7.764769
MariaDB 4 11.512196
Memcached 3 13.112403
MongoDB 2 4.943705
MySQL 3 8.792365
Neo4j 4 7.778134
PostgreSQL 6 23.587189
Presto 3 8.922513
Realm 2 6.935587
Redis 3 11.604444
Solr 3 6.103026
Spark SQL 2 5.338807
Sphinx 3 11.137621
SQLite 5 42.624477
ggplot(extractedTable_CommitCountByFile, aes(x=commitCount,color=name,alpha=0.5))+
geom_density(adjust = 3) +
  xlim(0,max(extMean$mean)) +
 ggtitle("Density of Commits") +
  xlab("CommitCount") + 
  ylab("Part of")
## Warning: Removed 15422 rows containing non-finite values (stat_density).

ggplot(extractedTable_CommitCountByFile, aes(x=commitCount,color=name,alpha=0.5))+
geom_density(position = "stack",adjust = 3) +
  geom_vline(data=extMean,aes(xintercept=mean, color=name),linetype="solid") +
  xlim(0,max(extMean$mean)) +
  geom_label_repel(data=extMean, mapping=aes(x=mean, y=5, label=name), size=4, angle=90, vjust=-0.4, hjust=0) +
  ggtitle("Density of Commits with median as VLine") +
  xlab("CommitCount limited bei maxMedianValue") + 
  ylab("Part of")+ theme(legend.position="none")
## Warning: Removed 15422 rows containing non-finite values (stat_density).

Gruppiert nach Datum werden die eindeutigen CommitIds für diesen Tag aufsummiert

extractedTable <- DBData %>%
  group_by(timestampInDate,name) %>%
    summarise(commitCount = n_distinct(commit))

Erzeugen einer extra Spalte für Monat und Jahr

extractedTable$month = month(extractedTable$timestampInDate)
extractedTable$year = year(extractedTable$timestampInDate)

Erneutes aufsummieren der Commits pro Monat

extractedTableMonth <- extractedTable %>%
  group_by(year,month,name) %>%
    summarise(commits = sum(commitCount))%>%
  filter(!is.na(year)&!is.na(month))

Weiterer Plot als Monatsübersicht.

labels <- extractedTableMonth %>%
  group_by(name)%>%
  summarise(min=min(year))
  
PlotMonthly <-ggplot(extractedTableMonth, aes(y=commits,x=year,month,color=name)) +
  geom_smooth(formula = y~x,method ='auto') +
  geom_label_repel(data=labels, mapping=aes(label=name,x=min,y=-200)) +
  geom_vline(data=labels,aes(xintercept=min, color=name),linetype="solid") +
  ggtitle("Commits over Time (Year,Month)") +  
  xlab("Time") + 
  ylab("Commits")+ theme(legend.position="none")

Angezeigt wird die Anzahl der Commits pro Monat für alle Repositorys

PlotMonthly
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Hier nun aufgeteilt auf einzelne Jahre mit den einzelnen Datenbanken als Anteil

PlotMonthlyFacet <- 
  ggplot(extractedTableMonth, aes(y=commits,x=month,fill=name)) +
      geom_bar(stat = "identity") +
  facet_wrap(~ year, ncol = 5) +ggtitle("Commit over Time (Facet by Year, Columned by Month)") +
            xlab("Time") + ylab("Commits")

PlotMonthlyFacet

Ab Hier Fangen neue Auswertungen an für die Frage der Korrelation zwischen Commits und Changes

Wie groß ist ein File in den DB-Repos ohne Negative Zeilenzahl

loc_table <- DBData %>%
  group_by(name, file) %>%
  summarise(loc = sum(change)) %>%
  filter(loc>0)

PlotFileSizeStacked <- ggplot(loc_table, aes(x=loc,fill = name,alpha=0.5)) + 
  geom_density(position = "stack") +
  xlim(0,500)+
  ggtitle("Dateigröße als DichtePlot Stacked") +
  xlab("Lines of Code") + 
  ylab("Häufigkeit")

Der Plot einmal als Stacked (Ist bisschen übersichtlicher aber die einzelnen Repos verschwimmen etwas)

PlotFileSizeStacked
## Warning: Removed 26896 rows containing non-finite values (stat_density).

PlotFileSize <- ggplot(loc_table, aes(x=loc,color = name,alpha=0.5)) + 
  geom_density() +
  xlim(0,500)+
  ggtitle("Dateigröße als DichtePlot") +
  xlab("Lines of Code") + 
  ylab("Häufigkeit")

Mit den Linien sieht man die einzelnen Plots etwas besser finde ich

PlotFileSize
## Warning: Removed 26896 rows containing non-finite values (stat_density).

Aufgeteilt auf einzelne Facets

Kann man noch etwas besser vergleichen

PlotFileSizeFacedWrap <- ggplot(loc_table, aes(x=loc,fill = name,alpha=0.5)) + 
  geom_density() +
  xlim(0,500)+
  facet_wrap(~name)+
  ggtitle("Dateigröße als DichtePlot aufgeteilt") +  
  xlab("Lines of Code") + 
  ylab("Häufigkeit")

PlotFileSizeFacedWrap
## Warning: Removed 26896 rows containing non-finite values (stat_density).

Durchschnittliche Dateigröße pro Datenbank

file_table_mean <- DBData %>%
  group_by(name,commit) %>%
  summarise(fileSize = sum(change))%>%
   summarise(mean = mean(fileSize)) 
            

PlotMeanFileSize <- ggplot(file_table_mean, aes(x = reorder(name, -mean),y=mean)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90))+
  ggtitle("Durchschnittliche Dateigröße") +
  ylab("Lines of Code") + 
  xlab("Datenbanken")

PlotMeanFileSize

Commitgröße (Wieviel Files sind in einem Commit)

commit_table <- DBData %>%
  group_by(name,commit)%>%
  summarise(files =n())

PlotCommitSize <- ggplot(commit_table, aes(x=files,color = name,fill=name,alpha=.4)) + 
  geom_density(adjust = 3) +
  xlim(0,15) +
  ggtitle("Commitgröße als DichtePlot") +
  xlab("Commitgröße") + 
  ylab("Anzahl Commits")

PlotCommitSize
## Warning: Removed 43883 rows containing non-finite values (stat_density).

Wieder aufgeteilt in einzelne Facets

PlotCommitSizeFacedWrap <- ggplot(commit_table, aes(x=files,fill = name,alpha=0.5)) + 
  geom_density(adjust = 3) + 
  xlim(0,15)+
  facet_wrap(~name) +
  ggtitle("Commitgröße als DichtePlot") +
  xlab("Commitgröße") + 
  ylab("Häufigkeit")

PlotCommitSizeFacedWrap
## Warning: Removed 43883 rows containing non-finite values (stat_density).

Durchschnittliche Commitgröße pro Datenbank

commit_table_mean <- DBData %>%
  group_by(name,commit) %>%
  summarise(commitSize = n()) %>%
  summarise(mean = mean(commitSize)) 

PlotMeanCommitSize <- ggplot(commit_table_mean, aes(x = reorder(name, -mean),y=mean)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90))+
  ggtitle("Durchschnittliche Commitgröße") +
  ylab("Commitgröße") + 
  xlab("Datenbanken")

PlotMeanCommitSize

Zusammenhang zwschen durchschnittlicher Commitgröße und Filegröße

commit_file_mean <- commit_table_mean %>%
  inner_join(file_table_mean,by="name")


PlotCompareCommitFileSize <- ggplot(commit_file_mean, aes(x=mean.x, y=mean.y,color=name)) +
  geom_point()+
  geom_label_repel(aes(label=name),hjust=-0.1, vjust=1)+
  ggtitle("Commitgröße - Filegröße") +
  ylab("Filegröße Mean") + 
  xlab("Commitgröße Mean")+ theme(legend.position="none")

PlotCompareCommitFileSize

Zusammenhang zwischen Changes und Commits

change_commit_table <- DBData %>%
  group_by(file,programmingLanguage,name) %>%
  summarise(sum = sum(abs(change)),commitCnt = n()) 

PlotCommitCntFileSizeFacetWrap <- ggplot(change_commit_table, aes(x=sum,y=commitCnt,color=name)) +
  geom_jitter() +
  xlim(0,25000) +
  ylim(0,1000) +
  facet_wrap(~name)+
  ggtitle("Anzahl Commits - Menge Änderungen Pro File") +
  xlab("Änderungen in Zeilen") + 
  ylab("Anzahl Commits")

PlotCommitCntFileSizeFacetWrap
## Warning: Removed 20153 rows containing missing values (geom_point).

Korrelation beider Werte

corr_table <- change_commit_table %>%
  group_by(name) %>%
  summarise(Correlation = cor(sum,commitCnt))


PlotCorrelationCommitFile <- ggplot(corr_table, aes(x=reorder(name,-Correlation),y=Correlation)) +
  geom_bar(stat="identity")+
  theme(axis.text.x = element_text(angle = 90))+
  ggtitle("Korrelation zwischen Anzahl Zeilen und Anzahl Commits") +
  ylab("Korrelation") + 
  xlab("Datanbank")

PlotCorrelationCommitFile

Selbe Auswertung bezogen auf die am meisten verwendete Programmiersprache

mainProgrammingLanguagesByName <- DBData %>%
  group_by(name,programmingLanguage)%>%
  filter(programmingLanguage!="unknown")%>%
  summarise(cnt = n())%>%
  arrange(name,desc(cnt))%>%
  group_by(name)%>%
  summarise(mainLang = first(programmingLanguage))
knitr::kable(mainProgrammingLanguagesByName)
name mainLang
Cassandra Java
Couchbase Ruby
CouchDB Erlang
Elasticsearch Java
etcd Go
Firebird C
Greenplum C
Hazelcast Java
HBase Java
Hive Java
Impala C++
InfluxDB Go
MariaDB C
Memcached C
MongoDB C++
MySQL C++
Neo4j Java
PostgreSQL C
Presto Java
Realm Objective-C++
Redis C
Solr Java
Spark SQL Scala
Sphinx C++
SQLite C
DBDataFilteredByMainLang <- DBData %>%
  inner_join(mainProgrammingLanguagesByName,by="name") %>%
  filter(programmingLanguage==mainLang)

commit_table_mean_main <- DBDataFilteredByMainLang %>%
  group_by(name,commit) %>%
  summarise(commitSize = n()) %>%
  summarise(mean = mean(commitSize)) 

PlotMeanCommitSizeMainLanguage <- ggplot(commit_table_mean_main, aes(x = reorder(name, -mean),y=mean)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90))+
  ggtitle("Durchschnittliche Commitgröße MainLanguage") +
  ylab("Commitgröße") + 
  xlab("Datenbanken")

file_table_mean_main <- DBDataFilteredByMainLang %>%
  group_by(name,commit) %>%
  summarise(fileSize = sum(change))%>%
   summarise(mean = mean(fileSize)) 
            

PlotMeanFileSizeMainLanguage <- ggplot(file_table_mean_main, aes(x = reorder(name, -mean),y=mean)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90))+
  ggtitle("Durchschnittliche Dateigröße MainLanguage") +
  ylab("Lines of Code") + 
  xlab("Datenbanken")

commit_file_mean_main <- commit_table_mean_main %>%
  inner_join(file_table_mean_main,by="name")


PlotCompareCommitFileSizeMainLanguage  <- ggplot(commit_file_mean_main, aes(x=mean.x, y=mean.y,color=name)) +
  geom_point()+
  geom_label_repel(aes(label=name),hjust=-0.1, vjust=1)+
  ggtitle("Commitgröße - Filegröße MainLanguage") +
  ylab("Filegröße Mean") + 
  xlab("Commitgröße Mean")+ theme(legend.position="none")

change_commit_table_main <- DBDataFilteredByMainLang %>%
  group_by(file,programmingLanguage,name) %>%
  summarise(sum = sum(abs(change)),commitCnt = n())

PlotCommitCntFileSizeFacetWrapMainLanguage <- ggplot(change_commit_table_main, aes(x=sum,y=commitCnt,color=name)) +
  geom_jitter() +
  xlim(0,25000) +
  ylim(0,1000) +
  facet_wrap(~name)+
  ggtitle("Anzahl Commits - Menge Änderungen MainLanguage") +
  xlab("Änderungen in Zeilen") + 
  ylab("Anzahl Commits")

corr_table_main <- change_commit_table_main %>%
  group_by(name) %>%
  summarise(Correlation = cor(sum,commitCnt))

PlotCorrelationCommitFileMainLanguage <- ggplot(corr_table_main, aes(x=reorder(name,-Correlation),y=Correlation)) +
  geom_bar(stat="identity")+
  theme(axis.text.x = element_text(angle = 90))+
  ggtitle("Korrelation zwischen Anzahl Zeilen und Anzahl Commits MainLanguage") +
  ylab("Korrelation") + 
  xlab("Datanbank")

require(gridExtra)
grid.arrange(PlotMeanCommitSizeMainLanguage, PlotMeanCommitSize, ncol=2)

grid.arrange(PlotMeanFileSizeMainLanguage, PlotMeanFileSize, ncol=2)

grid.arrange(PlotCompareCommitFileSizeMainLanguage, PlotCompareCommitFileSize, ncol=2)

grid.arrange(PlotCommitCntFileSizeFacetWrapMainLanguage, PlotCommitCntFileSizeFacetWrap, ncol=2)
## Warning: Removed 330 rows containing missing values (geom_point).
## Warning: Removed 20287 rows containing missing values (geom_point).

grid.arrange(PlotCorrelationCommitFileMainLanguage, PlotCorrelationCommitFile, ncol=2)